Analysis of clonal diversity (that is, how clone sizes are distributed for a given repertoire) in BCR repertoires collected from individuals with different immunological states.

Considerations

The number of clones discovered in a given repertoire is directly proportional to sequencing depth (the number of sequences sampled):

# number of clones versus number of sequences
seq_counts <- ddply(test_data, c("SampleType2", "PatientID"), summarise,
              n_sequences = length(Seq_ID), 
              n_clone = length(unique(CloneID)))
seq_counts <- seq_counts[which(seq_counts$SampleType2 != "Infected"), ]
seq_counts$SampleType2 <- factor(seq_counts$SampleType2,
                                 levels = c("Healthy", "COVID-19", "COVID-19Recovered", "Ebola",
                                            "RSV-I", "RSV-U", "YFVD28"))
ggplot(seq_counts, aes(x = n_sequences, y = n_clone)) +
  geom_smooth(method = "lm") + geom_point(aes(colour = SampleType2), size = 2) +
  scale_color_brewer(name = "Sample Type", type = "qual") +
  cowplot::theme_cowplot() + scale_y_log10(name = "Number of clones") +
  scale_x_log10(name = "Number of sequences")
## `geom_smooth()` using formula 'y ~ x'

This influences the choice of diversity metrics to quantify clone distributions for samples with very different sequencing depth, either:

N.B. in ecology, (species) richness refers to the number of species in the habitat, evenness refers to the distribution of the relative abundance of species observed. This paper by Victor Greiff & Sai Reddy explains and develops this idea very nicely.

A measure of (un)evenness known as “Gini coefficient” has been used recently to quantify clonal diversity (see this by Rachael Bashford-Rogers and this analysis of BCR repertoire in COVID ). See below (section “Diversity indices”) for quantification.

Visualise clone distribution

‘Patchwork’ plots to show size of clones, and V gene usage/breakdown by isotype & sample type etc.

Largest 5 clones for each individual. Show at most 100 clones per patchwork.

clones <- test_data[which(test_data$UseAsRef & test_data$V.DOMAIN.Functionality != "No rearrangement found"),
            c(which(colnames(test_data) %in% c("SampleType2", "PatientID", "CloneID",
                                               "Vfamily", "Vgene", "Dgene", "Jfamily")))]
# clone size by isotype
clone_count <- ddply(test_data, c("SampleType2", "PatientID", "CloneID"), summarise,
                     NumInClone = length(Seq_ID),
                     IgM_memory = sum(CellType == "IgM memory", na.rm = TRUE),
                     IgM_naive = sum(CellType == "Naive", na.rm = TRUE),
                     IgG1 = sum(Subclass == "IgG1", na.rm = TRUE),
                     IgG2 = sum(Subclass == "IgG2", na.rm = TRUE),
                     IgG3 = sum(Subclass == "IgG3", na.rm = TRUE),
                     IgG4 = sum(Subclass == "IgG4", na.rm = TRUE),
                     IgA1 = sum(Subclass == "IgA1", na.rm = TRUE),
                     IgA2 = sum(Subclass == "IgA2", na.rm = TRUE))

clones <- merge(clones, clone_count, by = c("SampleType2", "PatientID", "CloneID"),
                all.x = FALSE, all.y = FALSE, sort = FALSE)
clones <- clones[which(clones$SampleType2 != "Infected"), ]

# Normalise clone size into % repertoire
repertoire_size <- ddply(clones, c("SampleType2", "PatientID"), summarise,
                         repsize = sum(NumInClone))
clones <- merge(clones, repertoire_size, by = c("SampleType2", "PatientID"),
                sort = FALSE)

clones <- reshape2::melt(clones,
                         id.vars = c("SampleType2", "PatientID", "CloneID",
                                     "Vfamily", "Vgene", "Dgene", "Jfamily", "repsize"),
                         measure.vars = c("NumInClone", "IgM_memory", "IgM_naive", "IgG1", "IgG2",
                                          "IgG3", "IgG4", "IgA1", "IgA2"))
clones$PercInClone <- clones$value / clones$repsize
clones <- clones[which(clones$PercInClone > 0), ]

# Largest clones per sample type
clones <- merge(
  clones, 
  ddply(clones, c("SampleType2"), summarise,
        n = round(100 / length(unique(PatientID)), 0)), # number of clones to be taken for each sample-type
  by = "SampleType2"
)
top_bound <- ddply(clones, c("SampleType2", "PatientID", "variable"), summarise,
                   bound = sort(value, decreasing = TRUE)[min(length(unique(CloneID)), n)])
clones <- merge(clones, top_bound, by = c("SampleType2", "PatientID", "variable"), sort = FALSE)
clones$large <- (clones$value >= clones$bound)
saveRDS(clones, "all_clones_count.rds")

clones_large <- clones[which(clones$large),]
# random initialisation in 2D axes
clones_large$x <- runif(nrow(clones_large))
clones_large$y <- runif(nrow(clones_large))
# clones$logNumInClone <- log10(clones$NumInClone)

clones_large$variable <- factor(clones_large$variable,
                          levels = c("NumInClone", "IgM_naive", "IgM_memory", "IgG1", "IgG2",
                                     "IgG3", "IgG4", "IgA1", "IgA2"),
                          labels = c("all", "Naive", "IgM memory", "IgG1", "IgG2",
                                     "IgG3", "IgG4", "IgA1", "IgA2"))
# enforce largest 100 as in certain cases many clones have sizes right at the bound
clones_large <- split(clones_large, f = list(clones_large$SampleType2, clones_large$variable))
clones_large <- lapply(clones_large, function(tb) tb[1:100, ])
clones_large <- do.call("rbind", clones_large)
clones_large <- clones_large[which(!is.na(clones_large$PercInClone)), ]

plotPatchWork <- function(clone_tb, rep_tb = test_data, VgeneHighlight = NULL){
  if( is.null(VgeneHighlight) ){
    clone_tb$V <- clone_tb$Vfamily
    clone_tb$V <- factor(clone_tb$V, levels = paste("IGHV", 1:7, sep = ""))
    Vcols <- RColorBrewer::brewer.pal(7, "Accent")
  } else {
    clone_tb$V <- clone_tb$Vgene
    clone_tb$V <- sapply(clone_tb$V, function(v){
      if(v %in% VgeneHighlight) return(v) else return("others")
    })
    clone_tb$V <- factor(clone_tb$V, levels = c(VgeneHighlight, "others"))
    Vcols <- c(sample(scales::hue_pal()(length(VgeneHighlight))),
               "grey")
  }
  #tb <- merge(clone_tb,
  #            unique(rep_tb[, c("PatientID", "AgeGroup")]),
  #            sort = FALSE)
  tb <- clone_tb
  ggplot(tb) +
    geom_point(aes(x=x, y=y, fill = V, size = PercInClone),
               colour = "black", pch=21) +
    scale_fill_manual(values = Vcols, name = "V gene") +
    scale_size_continuous(range = c(0, 10), limits = c(0, 0.2),
                          breaks = c(0.0005, 0.005, 0.05, 0.15),
                          labels = scales::percent,
                          name = "% repertoire") +
    theme_void() + theme(axis.text = element_blank(),
                            axis.title = element_blank(),
                            axis.ticks = element_blank()) +
    guides(fill = guide_legend(override.aes = list(size=4)))
}

plotPatchWork(clones_large) + facet_grid(variable ~ SampleType2) + ggtitle("all - highlight V family") + 
  theme(panel.border = element_rect(fill = NA))

plotPatchWork(clones_large, 
              VgeneHighlight = c("IGHV1-69", "IGHV3-23", "IGHV3-30", "IGHV4-34", "IGHV4-39", "IGHV5-51")) +
  facet_grid(variable ~ SampleType2) + ggtitle(" all - highlight V genes") + 
  theme(panel.border = element_rect(fill = NA))

Different sequencing depth will bias the plot.

Distribution of sequencing depth across tumours:

summary( seq_counts$n_sequences )
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     836    5397   12040   18126   23074  105323

Sample 12,000 sequences (\(\approx\) median sequencing depth of donors) from each individual and select up to 100 clones with likelihood scaled to their clone sizes. Plot to check if bias can be minimised?

#Different 'depth' of repertoire which leads to different number of clones & different absolute/relative size of clones. 
clones_sampled <- test_data[which(test_data$UseAsRef & 
                                    test_data$V.DOMAIN.Functionality != "No rearrangement found"),
                            c(which(colnames(test_data) %in% c("PatientID", "CloneID",
                                                               "Vfamily", "Vgene", "Jfamily")))]
sampled_data <- split(test_data, f = list(test_data$SampleType2, test_data$PatientID), drop = TRUE)
sampled_data <- lapply(sampled_data, function(tb){
  if(nrow(tb) < 12000){ # 12,000 is the median sequencing depth across samples
    tb[sample(1:nrow(tb), size = 12000, replace = TRUE), ]
  } else{
    tb[sample(1:nrow(tb), size = 12000, replace = FALSE), ]
  }
})
sampled_data <- do.call("rbind", sampled_data)

# clone size by isotype
clone_count_sampled <- ddply(sampled_data, c("SampleType2", "PatientID", "CloneID"), summarise,
                     NumInClone = length(Seq_ID),
                     IgM_memory = sum(CellType == "IgM memory", na.rm = TRUE),
                     IgM_naive = sum(CellType == "Naive", na.rm = TRUE),
                     IgG1 = sum(Subclass == "IgG1", na.rm = TRUE),
                     IgG2 = sum(Subclass == "IgG2", na.rm = TRUE),
                     IgG3 = sum(Subclass == "IgG3", na.rm = TRUE),
                     IgG4 = sum(Subclass == "IgG4", na.rm = TRUE),
                     IgA1 = sum(Subclass == "IgA1", na.rm = TRUE),
                     IgA2 = sum(Subclass == "IgA2", na.rm = TRUE))
# just take time point 0

clones_sampled <- merge(clones_sampled, clone_count, by = c("PatientID", "CloneID"),
                        all.x = FALSE, all.y = FALSE, sort = FALSE)
clones_sampled <- clones_sampled[which(clones_sampled$SampleType2 != "Infected"), ]

# Normalise clone size into % repertoire
repertoire_size <- ddply(clones_sampled, c("SampleType2", "PatientID"), summarise,
                         repsize = sum(NumInClone))
clones_sampled <- merge(clones_sampled, repertoire_size, by = c("SampleType2", "PatientID"),
                        sort = FALSE)

clones_sampled <- reshape2::melt(clones_sampled,
                                 id.vars = c("SampleType2", "PatientID", "CloneID",
                                             "Vfamily", "Vgene", "Jfamily", "repsize"),
                                 measure.vars = c("NumInClone", "IgM_memory", "IgM_naive", "IgG1", "IgG2",
                                                  "IgG3", "IgG4", "IgA1", "IgA2"))
clones_sampled$PercInClone <- clones_sampled$value / clones_sampled$repsize
clones_sampled <- clones_sampled[which(clones_sampled$PercInClone > 0), ]

# Largest 100 clones per sample type
#clones <- clones[!clones$PatientID %in% c("YF189", "YF191", "YF192", "YF199", "YF200", "YF203"),]
clones_sampled <- merge(
  clones_sampled, 
  ddply(clones_sampled, c("SampleType2"), summarise,
        n = round(100 / length(unique(PatientID)), 0)), # number of clones to be taken for each sample-type
  by = "SampleType2"
)
top_bound <- do.call("rbind", lapply(unique(clones_sampled$SampleType2), function(x){
  do.call("rbind", lapply(unique(clones_sampled$PatientID), function(y){
    do.call("rbind", lapply(unique(clones_sampled$variable), function(z){
      tb <- clones_sampled[which(clones_sampled$SampleType2 == x &
                                   clones_sampled$PatientID == y &
                                   clones_sampled$variable == z), ]
      if(nrow(tb) == 0) return(data.frame())
      else if(nrow(tb) == 1) return(data.frame(SampleType2 = x, PatientID = y,
                                               variable = z, clones = tb$CloneID))
      else {
        clones <- sample(tb$CloneID, min(length(tb$CloneID), unique(tb$n)),
                         prob = tb$PercInClone)
        return(data.frame(SampleType2 = x, PatientID = y,
                          variable = z, clones = clones))
      }
    }))
  }))
}))

top_bound$large <- TRUE
clones_sampled <- merge(clones_sampled, top_bound, by.x = c("SampleType2", "PatientID", "variable", "CloneID"), 
                        by.y = c("SampleType2", "PatientID", "variable", "clones"), all.x = TRUE, sort = FALSE)
clones_sampled <- clones_sampled[which(clones_sampled$large),]
# random initialisation in 2D axes
clones_sampled$x <- runif(nrow(clones_sampled))
clones_sampled$y <- runif(nrow(clones_sampled))
clones_sampled$variable <- factor(clones_sampled$variable,
                                  levels = c("NumInClone", "IgM_naive", "IgM_memory", "IgG1", "IgG2",
                                             "IgG3", "IgG4", "IgA1", "IgA2"),
                                  labels = c("all", "Naive", "IgM memory", "IgG1", "IgG2",
                                             "IgG3", "IgG4", "IgA1", "IgA2"))

clones_sampled$SampleType2 <- factor(clones_sampled$SampleType2,
                                     levels = c("COVID-19", "COVID-19Recovered", "Ebola", "Healthy", 
                                                "RSV-I", "RSV-U", "YFVD28"))
plotPatchWork(clones_sampled) + facet_grid(variable ~ SampleType2) + ggtitle("sampled - highlight V family") + 
  theme(panel.border = element_rect(fill = NA))

plotPatchWork(clones_sampled, 
              VgeneHighlight = c("IGHV1-69", "IGHV3-23", "IGHV3-30", "IGHV4-34", "IGHV4-39", "IGHV5-51")) + 
  facet_grid(variable ~ SampleType2) + ggtitle(" sampled - highlight V genes") + 
  theme(panel.border = element_rect(fill = NA))

Diversity indices

Gini coefficient : 1 = completely uneven, 0 = completely even. Here plotted (1 - Gini) so that the higher the data point, the higher the diversity. One-way ANOVA and Dunnett post-hoc test (comparison against Healthy) depicted.

gini <- ddply(clones, c("SampleType2", "variable", "PatientID"), summarise, 
              gini_index = ifelse(length(value) < 3, NA, 1 - ineq::ineq(value)))
gini$SampleType2 <- factor(gini$SampleType2,
                           levels = c("Healthy", "COVID-19", "COVID-19Recovered", "Ebola", "RSV-I", "RSV-U",
                                      "YFVD28"))
#saveRDS(gini, "gini.rds")


# significance testing
doGiniTest <- function(gini_tb, sample_type_col = "SampleType2", 
                       category_col = "variable", split_variable_col = NA)
{
  if( is.na( split_variable_col ) ) variables <- 1 # no splitting
  else {
    variables <- unique( gini_tb[, split_variable_col] )
  }
  gini_test <- lapply(unique(gini_tb[, category_col]), function(x){
    require(multcomp)
    o <- lapply(variables, function(y){
      if( is.na( split_variable_col )){
        lm_tb <- gini_tb[which(gini_tb[, category_col] == x), ]
      } else {
        lm_tb <- gini_tb[which(gini_tb[, category_col] == x & gini_tb[, split_variable_col] == y), ]
      }
      if( nrow(lm_tb) == 0 ) return(data.frame())
      colnames(lm_tb)[ which(colnames(lm_tb) == sample_type_col) ] <- "xxx"
      if(nrow(lm_tb) > 0 & length(unique(lm_tb$xxx)) > 1 &
        sum(grepl("^Healthy", lm_tb$xxx)) > 0){
        onewayanova <- try( suppressWarnings(
          aov(
            as.formula( paste("gini_index", "xxx", sep = " ~ ")), data = lm_tb
          )), silent = TRUE)
        if( "aov" %in% class(onewayanova) ){
          p <- summary(onewayanova)[[1]][1,5]
          if( length(p) == 0 ) return(data.frame())
          if( is.nan(p) ) posthoc <- NA else {
            posthoc <- suppressWarnings( glht(onewayanova, linfct = mcp(xxx = "Dunnett")) )
            posthoc <- try( suppressWarnings( summary(posthoc) ), silent = TRUE )
            if( "try-error" %in% class(posthoc) ) return(data.frame())
            signif_comparison <- 1:length(posthoc$test$pvalues)#which(posthoc$test$pvalues < 0.05)
            if( length(signif_comparison) > 0){
              posthoc <- lapply(signif_comparison, function(y){
                data.frame(variable = x,
                          SampleType2 = unlist(strsplit(names(posthoc$test$coefficients)[y], split = " - "))[1],
                          pval = round(posthoc$test$pvalues[y], 4))
              })
              posthoc <- do.call("rbind", posthoc)
              if( !is.na( split_variable_col )){
                posthoc$split_variable <- y
              }
              return(posthoc)
            } else posthoc <- data.frame()
          }
        } else return(data.frame())
      } else return(data.frame())
    })
    do.call("rbind", o)
  })
  gini_test <- do.call("rbind", gini_test)
  rownames(gini_test) <- NULL
  if( is.na( split_variable_col )){
    gini_tb <- merge(gini_tb, gini_test, by = c(sample_type_col, category_col), all.x = TRUE, sort = FALSE)
  } else {
    gini_tb <- merge(gini_tb, gini_test, by.x = c(split_variable_col, sample_type_col, category_col), 
                     by.y = c("split_variable", sample_type_col, category_col), all.x = TRUE, sort = FALSE)
  }
  gini_tb$sig <- (gini_tb$pval < 0.05)
  gini_tb$sig[which(is.na(gini_tb$sig))] <- FALSE
  gini_tb  
}
gini <- doGiniTest(gini_tb = gini)
## Loading required package: multcomp
## Loading required package: mvtnorm
## Loading required package: survival
## Loading required package: TH.data
## Loading required package: MASS
## 
## Attaching package: 'TH.data'
## The following object is masked from 'package:MASS':
## 
##     geyser
gini$variable <- factor(gini$variable,
                        levels = c("NumInClone", "IgM_memory", "IgM_naive", "IgG1", "IgG2",
                                   "IgG3", "IgG4", "IgA1", "IgA2"),
                        labels = c("all", "IgM memory", "Naive", "IgG1", "IgG2",
                                   "IgG3", "IgG4", "IgA1", "IgA2"))

# reference lines showing median gini for healthy samples just for easy comparisons
healthy_lvl <- ddply(gini[which(gini$SampleType2 == "Healthy"), ], c("variable"), 
                     summarise, median_gini = median(gini_index, na.rm = TRUE))
ggplot(gini, aes(x = SampleType2, y = gini_index, colour = sig)) + geom_boxplot() + geom_point() + 
  xlab("") + ylab("1 - Gini coefficient") + 
  geom_hline(data = healthy_lvl, aes(yintercept = median_gini), linetype = "dashed", colour = "grey") +
  scale_colour_manual(name = "p < 0.05", values = c("TRUE" = "red", "FALSE" = "grey")) +
  facet_wrap(~ variable) + cowplot::theme_cowplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Removed 16 rows containing non-finite values (stat_boxplot).
## Warning: Removed 16 rows containing missing values (geom_point).

specific to each gene

Split by V/D/J and combinations - which gene usage is more associated with polyclonality / monoclonality?

# split by Vgene
gini_v <- ddply(clones, c("Vgene", "SampleType2", "variable", "PatientID"), summarise, 
                gini_index = ifelse(length(value) < 3, NA, 1 - ineq::ineq(value)))
gini_v$SampleType2 <- factor(gini_v$SampleType2,
                             levels = c("Healthy", "COVID-19", "COVID-19Recovered", "Ebola", "RSV-I", "RSV-U",
                                        "YFVD28"))
gini_v <- doGiniTest(gini_v, split_variable_col = "Vgene")

# split by Dgene
gini_d <- ddply(clones, c("Dgene", "SampleType2", "variable", "PatientID"), summarise, 
                gini_index = ifelse(length(value) < 3, NA, 1 - ineq::ineq(value)))
gini_d <- gini_d[which(!is.na(gini_d$Dgene)), ]
gini_d$SampleType2 <- factor(gini_d$SampleType2,
                             levels = c("Healthy", "COVID-19", "COVID-19Recovered", "Ebola", "RSV-I", "RSV-U",
                                        "YFVD28"))
gini_d <- doGiniTest(gini_d, split_variable_col = "Dgene")

# split by Jfamily
gini_j <- ddply(clones, c("Jfamily", "SampleType2", "variable", "PatientID"), summarise, 
                gini_index = ifelse(length(value) < 3, NA, 1 - ineq::ineq(value)))
gini_j$SampleType2 <- factor(gini_j$SampleType2,
                             levels = c("Healthy", "COVID-19", "COVID-19Recovered", "Ebola", "RSV-I", "RSV-U",
                                        "YFVD28"))
gini_j <- doGiniTest(gini_j, split_variable_col = "Jfamily")

# split by V-J
gini_vj <- ddply(clones, c("Vgene", "Jfamily", "SampleType2", "variable", "PatientID"), summarise, 
                gini_index = ifelse(length(value) < 3, NA, 1 - ineq::ineq(value)))
gini_vj$SampleType2 <- factor(gini_vj$SampleType2,
                             levels = c("Healthy", "COVID-19", "COVID-19Recovered", "Ebola", "RSV-I", "RSV-U",
                                        "YFVD28"))
gini_vj$VJ <- gsub("IGH", "", paste(gini_vj$Vgene, gini_vj$Jfamily, sep = "."))
gini_vj <- doGiniTest(gini_vj, split_variable_col = "VJ")

# split by V-D
gini_vd <- ddply(clones, c("Vgene", "Dgene", "SampleType2", "variable", "PatientID"), summarise, 
                gini_index = ifelse(length(value) < 3, NA, 1 - ineq::ineq(value)))
gini_vd$SampleType2 <- factor(gini_vd$SampleType2,
                             levels = c("Healthy", "COVID-19", "COVID-19Recovered", "Ebola", "RSV-I", "RSV-U",
                                        "YFVD28"))
gini_vd$VD <- gsub("IGH", "", paste(gini_vd$Vgene, gini_vd$Dgene, sep = "."))
gini_vd <- doGiniTest(gini_vd, split_variable_col = "VD")

# split by D-J
gini_dj <- ddply(clones, c("Dgene", "Jfamily", "SampleType2", "variable", "PatientID"), summarise, 
                gini_index = ifelse(length(value) < 3, NA, 1 - ineq::ineq(value)))
gini_dj$SampleType2 <- factor(gini_dj$SampleType2,
                             levels = c("Healthy", "COVID-19", "COVID-19Recovered", "Ebola", "RSV-I", "RSV-U",
                                        "YFVD28"))
gini_dj$DJ <- gsub("IGH", "", paste(gini_dj$Dgene, gini_dj$Jfamily, sep = "."))
gini_dj <- doGiniTest(gini_dj, split_variable_col = "DJ")

# split by V-D-J
gini_vdj <- ddply(clones, c("Vgene", "Dgene", "Jfamily", "SampleType2", "variable", "PatientID"), summarise, 
                gini_index = ifelse(length(value) < 3, NA, 1 - ineq::ineq(value)))
gini_vdj$SampleType2 <- factor(gini_vdj$SampleType2,
                               levels = c("Healthy", "COVID-19", "COVID-19Recovered", "Ebola", "RSV-I", "RSV-U",
                                          "YFVD28"))
gini_vdj$VDJ <- gsub("IGH", "", paste(gini_vdj$Vgene, gini_vdj$Dgene, gini_vdj$Jfamily, sep = "."))
gini_vdj <- doGiniTest(gini_vdj, split_variable_col = "VDJ")

# combine all these tables together
gini_split <- list(
  gini_v, gini_d, gini_j, gini_vj, gini_vd, gini_dj, gini_vdj
)
gini_split[[1]]$category <- "V"
gini_split[[2]]$category <- "D"
gini_split[[3]]$category <- "J"
gini_split[[4]]$category <- "V-J"
gini_split[[5]]$category <- "V-D"
gini_split[[6]]$category <- "D-J"
gini_split[[7]]$category <- "V-D-J"
gini_split <- do.call("rbind", lapply(gini_split, function(tb){
  colnames(tb)[1] <- "split_variable"
  tb[, c("category", "SampleType2", "variable", "split_variable", "PatientID", "gini_index", "sig")]
}))
saveRDS(gini_split, "gini_diversity_splitByVDJ_ANOVA.rds")
gini_split <- readRDS('gini_diversity_splitByVDJ_ANOVA.rds')
# only select gene usage which show significant (p < 0.05) difference across sample types 
# in a given sequence subset to plot
gini_to_plot <- unique(gini_split[, c("category", "variable", "split_variable", "sig")])
gini_to_plot <- gini_to_plot[gini_to_plot$sig, ]
gini_to_plot$plot <- TRUE

# calculate mean per sample type
gini_split <- ddply(gini_split, c("category", "SampleType2", "variable", "split_variable", "sig"), 
                    summarise, mean_gini = mean(gini_index, na.rm = TRUE))
gini_split <- merge(gini_split, gini_to_plot[, which(colnames(gini_to_plot) != "sig")], 
                    by = c("category", "variable", "split_variable"), all.x = TRUE)
gini_split <- gini_split[!is.na(gini_split$plot), ]
gini_split <- merge(gini_split, gini_split[which(gini_split$SampleType2 == "Healthy"), 
                                           c("category", "variable", "split_variable", "mean_gini")],
                    by = c("category", "variable", "split_variable"))
gini_split$mean_gini <- gini_split$mean_gini.x - gini_split$mean_gini.y

# only include genes which were considered in the gene usage analysis
gene_usage_test <- "GraphPad results for Joseph/Bulk_test.csv"
gene_usage_test <- read.csv(gene_usage_test, stringsAsFactors = FALSE)
gene_usage_test$SampleType <- factor(gene_usage_test$SampleType,
                                     levels = c("COVID-19", "COVID-19 Recovered", "Ebola", 
                                                "RSV-I", "RSV-U", "YFVD28"),
                                     labels = c("COVID-19", "COVID-19Recovered", "Ebola", 
                                                "RSV-I", "RSV-U", "YFVD28"))
gene_usage_test$include <- TRUE
gini_split <- merge(gini_split, gene_usage_test[, c(-1, -2)], by.x = c("SampleType2", "split_variable"),
                    by.y = c("SampleType", "gene"), all.x = TRUE, all.y = FALSE, sort = FALSE)
gini_split <- gini_split[which(!is.na(gini_split$include)), ]
  
gini_split$variable <- factor(gini_split$variable,
                          levels = c("NumInClone", "IgM_naive", "IgM_memory", "IgG1", "IgG2",
                                     "IgG3", "IgG4", "IgA1", "IgA2"),
                          labels = c("all", "Naive", "IgM memory", "IgG1", "IgG2",
                                     "IgG3", "IgG4", "IgA1", "IgA2"))
gini_split$mean_gini <- replace(gini_split$mean_gini,
                                which(gini_split$mean_gini < -0.4), -0.4)
gini_split$mean_gini <- replace(gini_split$mean_gini,
                                which(gini_split$mean_gini > 0.4), 0.4)
# plot V genes
ggplot(gini_split[which(gini_split$category == "V" & gini_split$SampleType2 != "Healthy"), ], 
       aes(x = SampleType2, y = split_variable, fill = mean_gini, size = sig)) +
    geom_point(pch = 21) +
    scale_fill_gradient2(low = "blue", high = "red", midpoint = 0, mid = "white", 
                         breaks = c(-0.4, -0.2, 0.2, 0.4),
                         name = "mean diversity\n(difference\nfrom Healthy)") +
    scale_size_manual(values = c("TRUE" = 7, "FALSE" = 2),
                      breaks = c("TRUE", "FALSE"), name = "p < 0.05") +
    xlab("") + ylab("") + facet_wrap(~ variable, nrow = 1) +
    cowplot::theme_cowplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "bottom")

# plot D genes
ggplot(gini_split[which(gini_split$category == "D" & gini_split$SampleType2 != "Healthy"), ], 
       aes(x = SampleType2, y = split_variable, fill = mean_gini, size = sig)) +
    geom_point(pch = 21) +
    scale_fill_gradient2(low = "blue", high = "red", midpoint = 0, mid = "white", 
                         breaks = c(-0.4, -0.2, 0.2, 0.4),
                         name = "mean diversity\n(difference\nfrom Healthy)") +
    scale_size_manual(values = c("TRUE" = 7, "FALSE" = 2),
                      breaks = c("TRUE", "FALSE"), name = "p < 0.05") +
    xlab("") + ylab("") + facet_wrap(~ variable, nrow = 1) +
    cowplot::theme_cowplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "bottom")

# plot D genes
ggplot(gini_split[which(gini_split$category == "J" & gini_split$SampleType2 != "Healthy"), ], 
       aes(x = SampleType2, y = split_variable, fill = mean_gini, size = sig)) +
    geom_point(pch = 21) +
    scale_fill_gradient2(low = "blue", high = "red", midpoint = 0, mid = "white", 
                         breaks = c(-0.4, -0.2, 0.2, 0.4),
                         name = "mean diversity\n(difference\nfrom Healthy)") +
    scale_size_manual(values = c("TRUE" = 7, "FALSE" = 2),
                      breaks = c("TRUE", "FALSE"), name = "p < 0.05") +
    xlab("") + ylab("") + facet_wrap(~ variable, nrow = 1) +
    cowplot::theme_cowplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "bottom")

separated by age

Difference between age \(\leq50\) vs \(\geq60\) ?

# map age groups
age_groups <- unique(test_data[, c("PatientID", "Age", "SampleType2")])
age_groups <- age_groups[which(age_groups$SampleType2 %in% c("COVID-19", "Healthy")), ]
age_groups$AgeGroup <- sapply(as.numeric(age_groups$Age), function(x){
  if(is.na(x)) return(NA)
  if(x <= 50) return("leq50")
  if(x >= 60) return("geq60")
  return(NA)
})
age_groups <- age_groups[which(!is.na(age_groups$AgeGroup)), ]

clones <- merge(clones, age_groups, by = c("SampleType2", "PatientID"), all.x = TRUE, sort = FALSE )
clones <- clones[which(!is.na(clones$AgeGroup)), ]

clones <- split(clones, clones$AgeGroup)

gini_age <- lapply(clones, function(tb){
  # split by Vgene
  gini_v_age <- ddply(tb, c("Vgene", "SampleType2", "variable", "PatientID"), summarise, 
                  gini_index = ifelse(length(value) < 3, NA, 1 - ineq::ineq(value)))
  gini_v_age$SampleType2 <- factor(gini_v_age$SampleType2,
                               levels = c("Healthy", "COVID-19"))
  gini_v_age <- doGiniTest(gini_v_age, split_variable_col = "Vgene")

  # split by Dgene
  gini_d_age <- ddply(tb, c("Dgene", "SampleType2", "variable", "PatientID"), summarise, 
                  gini_index = ifelse(length(value) < 3, NA, 1 - ineq::ineq(value)))
  gini_d_age <- gini_d_age[which(!is.na(gini_d_age$Dgene)), ]
  gini_d_age$SampleType2 <- factor(gini_d_age$SampleType2,
                               levels = c("Healthy", "COVID-19"))
  gini_d_age <- doGiniTest(gini_d_age, split_variable_col = "Dgene")

  # split by Jfamily
  gini_j_age <- ddply(tb, c("Jfamily", "SampleType2", "variable", "PatientID"), summarise, 
                  gini_index = ifelse(length(value) < 3, NA, 1 - ineq::ineq(value)))
  gini_j_age$SampleType2 <- factor(gini_j_age$SampleType2,
                               levels = c("Healthy", "COVID-19"))
  gini_j_age <- doGiniTest(gini_j_age, split_variable_col = "Jfamily")

  # split by V-J
  gini_vj_age <- ddply(tb, c("Vgene", "Jfamily", "SampleType2", "variable", "PatientID"), summarise, 
                  gini_index = ifelse(length(value) < 3, NA, 1 - ineq::ineq(value)))
  gini_vj_age$SampleType2 <- factor(gini_vj_age$SampleType2,
                                levels = c("Healthy", "COVID-19"))
  gini_vj_age$VJ <- gsub("IGH", "", paste(gini_vj_age$Vgene, gini_vj_age$Jfamily, sep = "."))
  gini_vj_age <- doGiniTest(gini_vj_age, split_variable_col = "VJ")

  # split by V-D
  gini_vd_age <- ddply(tb, c("Vgene", "Dgene", "SampleType2", "variable", "PatientID"), summarise, 
                gini_index = ifelse(length(value) < 3, NA, 1 - ineq::ineq(value)))
  gini_vd_age$SampleType2 <- factor(gini_vd_age$SampleType2,
                                levels = c("Healthy", "COVID-19"))
  gini_vd_age$VD <- gsub("IGH", "", paste(gini_vd_age$Vgene, gini_vd_age$Dgene, sep = "."))
  gini_vd_age <- doGiniTest(gini_vd_age, split_variable_col = "VD")

  # split by D-J
  gini_dj_age <- ddply(tb, c("Dgene", "Jfamily", "SampleType2", "variable", "PatientID"), summarise, 
                  gini_index = ifelse(length(value) < 3, NA, 1 - ineq::ineq(value)))
  gini_dj_age$SampleType2 <- factor(gini_dj_age$SampleType2,
                               levels = c("Healthy", "COVID-19"))
  gini_dj_age$DJ <- gsub("IGH", "", paste(gini_dj_age$Dgene, gini_dj_age$Jfamily, sep = "."))
  gini_dj_age <- doGiniTest(gini_dj_age, split_variable_col = "DJ")

  # split by V-D-J
  gini_vdj_age <- ddply(tb, c("Vgene", "Dgene", "Jfamily", "SampleType2", "variable", "PatientID"), summarise, 
                  gini_index = ifelse(length(value) < 3, NA, 1 - ineq::ineq(value)))
  gini_vdj_age$SampleType2 <- factor(gini_vdj_age$SampleType2,
                                 levels = c("Healthy", "COVID-19"))
  gini_vdj_age$VDJ <- gsub("IGH", "", paste(gini_vdj_age$Vgene, gini_vdj_age$Dgene, gini_vdj_age$Jfamily, sep = "."))
  gini_vdj_age <- doGiniTest(gini_vdj_age, split_variable_col = "VDJ")

  # combine all these tables together
  gini_split_age <- list(
    gini_v_age, gini_d_age, gini_j_age, gini_vj_age, gini_vd_age, gini_dj_age, gini_vdj_age
  )
  gini_split_age[[1]]$category <- "V"
  gini_split_age[[2]]$category <- "D"
  gini_split_age[[3]]$category <- "J"
  gini_split_age[[4]]$category <- "V-J"
  gini_split_age[[5]]$category <- "V-D"
  gini_split_age[[6]]$category <- "D-J"
  gini_split_age[[7]]$category <- "V-D-J"
  gini_split_age <- do.call("rbind", lapply(gini_split_age, function(x){
    colnames(x)[1] <- "split_variable"
    x[, c("category", "SampleType2", "variable", "split_variable", "PatientID", "gini_index", "sig")]
  }))
  gini_split_age
})
gini_age[[1]]$AgeGroup <- "geq60"
gini_age[[2]]$AgeGroup <- "leq50"
gini_age <- do.call("rbind", gini_age)
saveRDS(gini_age, "gini_diversity_splitByAgeVDJ_ANOVA.rds")
gini_age <- readRDS("gini_diversity_splitByAgeVDJ_ANOVA.rds")

# only select gene usage which show significant (p < 0.05) difference across sample types 
# in a given sequence subset to plot
gini_to_plot <- unique(gini_age[, c("category", "variable", "split_variable", "sig")])
gini_to_plot <- gini_to_plot[gini_to_plot$sig, ]
gini_to_plot$plot <- TRUE

# calculate mean per sample type
gini_age <- ddply(gini_age, c("category", "AgeGroup", "SampleType2", "variable", "split_variable", "sig"), 
                    summarise, mean_gini = mean(gini_index, na.rm = TRUE))
gini_age <- merge(gini_age, gini_to_plot[, which(colnames(gini_to_plot) != "sig")], 
                    by = c("category", "variable", "split_variable"), all.x = TRUE)
gini_age <- gini_age[!is.na(gini_age$plot), ]
gini_age <- merge(gini_age, gini_age[which(gini_age$SampleType2 == "Healthy"), 
                                           c("AgeGroup", "category", "variable", "split_variable", "mean_gini")],
                    by = c("category", "AgeGroup", "variable", "split_variable"))
gini_age$mean_gini <- gini_age$mean_gini.x - gini_age$mean_gini.y

# only include genes which were considered in the gene usage analysis (5 times rule etc)
gene_usage_test <- "GraphPad results for Joseph/Bulk_test.csv"
gene_usage_test <- read.csv(gene_usage_test, stringsAsFactors = FALSE)
gene_usage_test$SampleType <- factor(gene_usage_test$SampleType,
                                     levels = c("COVID-19", "COVID-19 Recovered", "Ebola", 
                                                "RSV-I", "RSV-U", "YFVD28"),
                                     labels = c("COVID-19", "COVID-19Recovered", "Ebola", 
                                                "RSV-I", "RSV-U", "YFVD28"))
gene_usage_test$include <- TRUE
gini_age <- merge(gini_age, gene_usage_test[, c(-1, -2)], by.x = c("SampleType2", "split_variable"),
                    by.y = c("SampleType", "gene"), all.x = TRUE, all.y = FALSE, sort = FALSE)
gini_age <- gini_age[which(!is.na(gini_age$include)), ]
  
gini_age$variable <- factor(gini_age$variable,
                          levels = c("NumInClone", "IgM_naive", "IgM_memory", "IgG1", "IgG2",
                                     "IgG3", "IgG4", "IgA1", "IgA2"),
                          labels = c("all", "Naive", "IgM memory", "IgG1", "IgG2",
                                     "IgG3", "IgG4", "IgA1", "IgA2"))
gini_age$mean_gini <- replace(gini_age$mean_gini,
                                which(gini_age$mean_gini < -0.4), -0.4)
gini_age$mean_gini <- replace(gini_age$mean_gini,
                                which(gini_age$mean_gini > 0.4), 0.4)
gini_age$AgeGroup <- factor(gini_age$AgeGroup, levels = c("leq50", "geq60"))
# plot V genes
ggplot(gini_age[which(gini_age$category == "V" & gini_age$SampleType2 != "Healthy"), ], 
       aes(x = AgeGroup, y = split_variable, fill = mean_gini, size = sig)) +
    geom_point(pch = 21) + scale_x_discrete(name = "Age", labels= c(expression(""<=50), expression("">=60))) +
    scale_fill_gradient2(low = "blue", high = "red", midpoint = 0, mid = "white", 
                         breaks = c(-0.4, -0.2, 0.2, 0.4),
                         name = "mean diversity\n(difference\nfrom Healthy)") +
    scale_size_manual(values = c("TRUE" = 7, "FALSE" = 2),
                      breaks = c("TRUE", "FALSE"), name = "p < 0.05") +
    xlab("") + ylab("") + facet_wrap(~ variable, nrow = 1) +
    cowplot::theme_cowplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "bottom")

# plot D genes
ggplot(gini_age[which(gini_age$category == "D" & gini_age$SampleType2 != "Healthy"), ], 
       aes(x = AgeGroup, y = split_variable, fill = mean_gini, size = sig)) +
    geom_point(pch = 21) + scale_x_discrete(name = "Age", labels= c(expression(""<=50), expression("">=60))) +
    scale_fill_gradient2(low = "blue", high = "red", midpoint = 0, mid = "white", 
                         breaks = c(-0.4, -0.2, 0.2, 0.4),
                         name = "mean diversity\n(difference\nfrom Healthy)") +
    scale_size_manual(values = c("TRUE" = 7, "FALSE" = 2),
                      breaks = c("TRUE", "FALSE"), name = "p < 0.05") +
    xlab("") + ylab("") + facet_wrap(~ variable, nrow = 1) +
    cowplot::theme_cowplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "bottom")

# plot D genes
ggplot(gini_age[which(gini_age$category == "J" & gini_age$SampleType2 != "Healthy"), ], 
       aes(x = AgeGroup, y = split_variable, fill = mean_gini, size = sig)) +
    geom_point(pch = 21) + scale_x_discrete(name = "Age", labels= c(expression(""<=50), expression("">=60))) +
    scale_fill_gradient2(low = "blue", high = "red", midpoint = 0, mid = "white", 
                         breaks = c(-0.4, -0.2, 0.2, 0.4),
                         name = "mean diversity\n(difference\nfrom Healthy)") +
    scale_size_manual(values = c("TRUE" = 7, "FALSE" = 2),
                      breaks = c("TRUE", "FALSE"), name = "p < 0.05") +
    xlab("") + ylab("") + facet_wrap(~ variable, nrow = 1) +
    cowplot::theme_cowplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "bottom")